home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Power Programmierung
/
Power-Programmierung (Tewi)(1994).iso
/
qtawk
/
soundx4.exp
< prev
next >
Wrap
Text File
|
1990-04-23
|
3KB
|
87 lines
# QTAwk Soundix Algorithm
#
# Optimized Soundix Algorithm. Adapted for QTAwk from article in:
# "The C Gazette", Vol. 4, No. 2, Autumn 1989, page 29, by Joe Celko
#
BEGIN {
sl = 4; # set code length
}
{
for ( i = 1 ; i <= NF ; i++ ) printf(" Result: %s ==> %s\n",$i,soundix4($i,sl));
}
function soundix4(inname,len) {
local workbuf;
local leading_letter;
#* make a working copy
workbuf = strupr(inname);
#* convert all vowels to 'A'
gsub(/[AEIOUY]/,'A',workbuf);
#* prefix transformations: done only once on the front of a name
sub(/^MAC/,"MCC",workbuf); # MAC -> MCC
sub(/^KN/ ,"NN" ,workbuf); # KN -> NN
sub(/^PF/ ,"FF" ,workbuf); # PF -> FF
sub(/^SCH/,"SSS",workbuf); # SCH -> SSS
sub(/^K/ ,'C' ,workbuf); # K -> C
#* preserve leading letter
leading_letter = substr(workbuf,1,1);
workbuf = substr(workbuf,2);
#* infix transformations: done after the first letter
#* and are from left to right on the name
gsub(/DG/ ,"GG" ,workbuf); # DG -> GG
gsub(/CAAN/,"TAAN",workbuf); # CAAN -> TAAN
gsub(/D/ ,'T' ,workbuf); # D -> T
gsub(/NST/ ,"NSS" ,workbuf); # NST -> NSS
gsub(/AV/ ,"AF" ,workbuf); # AV -> AF
gsub(/Q/ ,'G' ,workbuf); # Q -> G
gsub(/Z/ ,'S' ,workbuf); # Z -> S
gsub(/M/ ,'N' ,workbuf); # M -> N
gsub(/KN/ ,"NN" ,workbuf); # KN -> NN
gsub(/K/ ,'C' ,workbuf); # K -> C
gsub(/AH/ ,"AA" ,workbuf); # AH -> AA
gsub(/HA/ ,"AA" ,workbuf); # HA -> AA
gsub(/AW/ ,"AA" ,workbuf); # AW -> AA
gsub(/PH/ ,"FF" ,workbuf); # PH -> FF
gsub(/SCH/ ,"SSS" ,workbuf); # SCH -> SSS
#* suffix transformations: done on the end of the word going right to left
#* (1) remove terminal A's and S's
sub(/[AS]+$/,"",workbuf);
#* (2) terminal NT-> TT
sub(/NT$/,"TT",workbuf);
#* now strip out all vowels except the first - remember that all vowels
#* were transformed to 'A' earlier
gsub(/A/,"",workbuf);
#* remove all duplicate letters.
#* Note this is different from the Soundex3 duplicate cleanup because
#* the letter transforms can create duplicates at the front of the
#* output name
gsub(/B+/,'B',workbuf);
gsub(/C+/,'C',workbuf);
gsub(/F+/,'F',workbuf);
gsub(/H+/,'H',workbuf);
gsub(/G+/,'G',workbuf);
gsub(/J+/,'J',workbuf);
gsub(/L+/,'L',workbuf);
gsub(/N+/,'N',workbuf);
gsub(/P+/,'P',workbuf);
gsub(/R+/,'R',workbuf);
gsub(/S+/,'S',workbuf);
gsub(/T+/,'T',workbuf);
gsub(/V+/,'V',workbuf);
gsub(/W+/,'W',workbuf);
gsub(/X+/,'X',workbuf);
#* return proper length code
return leading_letter ∩ (len ? substr(workbuf,1,len - 1) : workbuf);
}